NS-Forest workflow¶

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Location of the NSForest source folder. The default is the session path used on
## cloudos.lifebit.ai; set the NSFOREST_CODE_PATH environment variable to point at
## another checkout (e.g., "/Users/zhangy71/NSForest") without editing this cell.
CODE_PATH = os.environ.get("NSFOREST_CODE_PATH", "/home/jovyan/session_data/NSForest")

## put the checkout first on sys.path so that `nsforest` is imported from CODE_PATH
sys.path.insert(0, os.path.abspath(CODE_PATH))
from nsforest import ns, nsforesting, utils, NSFOREST_VERSION
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
  warnings.warn(msg, FutureWarning)
/mnt/libraries/envs/nsforest/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_umi_tools from `anndata` is deprecated. Import anndata.io.read_umi_tools instead.
  warnings.warn(msg, FutureWarning)
In [2]:
## lift the row limit so that displayed tables are never truncated
pd.options.display.max_rows = None

0. Set up¶

In [3]:
## set up (lines marked with #<--- are the values to edit per dataset)
organ = "kidney" #<---
author = "Lake" #<---
year = "2023" #<---
## every file written by this notebook goes here, e.g., "outputs_kidney_Lake_2023/"
output_folder = "outputs_" + organ + "_" + author + "_" + year + "/"
## create the folder up front so that the save cells do not depend on the
## dendrogram cell having been run first
os.makedirs(output_folder, exist_ok=True)

## obs column holding the cluster labels; also reused to name the output files
cluster_header = "subclass.full" #<---
outputfilename_suffix = cluster_header
outputfilename_prefix = cluster_header

1. Data¶

[TODO: need to filter normal cells -- Anne]¶

In [4]:
## Folder holding the input .h5ad file. The default is the read-only mount used on
## cloudos.lifebit.ai; set the NSFOREST_DATA_FOLDER environment variable to run
## elsewhere (e.g., "/Users/zhangy71/Documents/Kidney-2025/Data/Lake-KPMP-2023/").
## os.path.join(..., "") guarantees the trailing separator expected by code that
## builds paths as `data_folder + filename`.
data_folder = os.path.join(
    os.environ.get("NSFOREST_DATA_FOLDER", "/home/jovyan/session_data/mounted-data-readonly/"),
    "",
)
In [5]:
## load the input AnnData; os.path.join gives the right path whether or not
## data_folder ends with a path separator
adata = sc.read_h5ad(os.path.join(data_folder, "adata_normal_n3566.h5ad")) #<---
In [6]:
## display the AnnData summary (dimensions and the available annotation keys)
adata
Out[6]:
AnnData object with n_obs × n_vars = 3566 × 33826
    obs: 'nCount_RNA', 'nFeature_RNA', 'library', 'percent.er', 'percent.mt', 'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score', 'matrisome.score', 'collagen.score', 'glycoprotein.score', 'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen', 'condition.long', 'condition.l1', 'condition.l2', 'donor_id', 'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla', 'sample_tissue_type', 'id', 'pagoda_k100_infomap_coembed', 'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1', 'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI', 'diabetes_history', 'hypertension', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'citation', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap'

2. Clusters¶

number of clusters¶

In [7]:
## count the distinct labels present in the cluster column (missing values excluded)
n_clusters = adata.obs.loc[:, cluster_header].nunique(dropna=True)
n_clusters
Out[7]:
75

dendrogram (run this to automatically create the output folder)¶

In [8]:
## auto-adjust figsize
## width: one unit per 5 clusters, but never 0 -- with fewer than 5 clusters the
## previous int(n_clusters/5) produced a zero-width figure
fig_width = max(1, n_clusters // 5)
## height: one unit per 30 characters of the longest cluster label (plus 1), minimum 2
fig_height = max(2, max(len(z) for z in adata.obs[cluster_header].unique()) // 30 + 1)
In [9]:
## build the cluster dendrogram and save it as an svg file in output_folder
ns.pp.dendrogram(
    adata,
    cluster_header,
    figsize = (fig_width, fig_height),
    tl_kwargs = dict(optimal_ordering = True),
    save = "svg",
    output_folder = output_folder,
    outputfilename_suffix = outputfilename_suffix,
)
WARNING: You’re trying to run this on 33826 dimensions of `.X`, if you really want this, set `use_rep='X'`.
         Falling back to preprocessing with `sc.pp.pca` and default params.
WARNING: saving figure to file outputs_kidney_Lake_2023/dendrogram_subclass.full.svg
No description has been provided for this image

cluster sizes¶

In [10]:
## tabulate how many cells carry each cluster label (largest first)
df_cluster_sizes = adata.obs[cluster_header].value_counts().to_frame()
df_cluster_sizes
Out[10]:
count
subclass.full
Adaptive / Maladaptive / Repairing Fibroblast 50
Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell 50
Mesangial Cell 50
Medullary Thick Ascending Limb Cell 50
Medullary Fibroblast 50
Mast Cell 50
Macula Densa Cell 50
M2 Macrophage 50
Lymphatic Endothelial Cell 50
Intercalated Cell Type B 50
Inner Medullary Collecting Duct Cell 50
Glomerular Capillary Endothelial Cell 50
Fibroblast 50
Distal Convoluted Tubule Cell Type 2 50
Distal Convoluted Tubule Cell Type 1 50
Descending Vasa Recta Endothelial Cell 50
Descending Thin Limb Cell Type 3 50
Monocyte-derived Cell 50
Myofibroblast 50
Natural Killer Cell / Natural Killer T Cell 50
Podocyte 50
Vascular Smooth Muscle Cell 50
Transitional Principal-Intercalated Cell 50
T Cell 50
Renin-positive Juxtaglomerular Granular Cell 50
Proximal Tubule Epithelial Cell Segment 3 50
Proximal Tubule Epithelial Cell Segment 1 / Segment 2 50
Plasma Cell 50
Neutrophil 50
Peritubular Capilary Endothelial Cell 50
Parietal Epithelial Cell 50
Papillary Tip Epithelial Cell 50
Outer Medullary Collecting Duct Principal Cell 50
Outer Medullary Collecting Duct Intercalated Cell Type A 50
Non-classical Monocyte 50
Descending Thin Limb Cell Type 2 50
Descending Thin Limb Cell Type 1 50
Degenerative Vascular Smooth Muscle Cell 50
Cycling Proximal Tubule Epithelial Cell 50
Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell 50
Afferent / Efferent Arteriole Endothelial Cell 50
Ascending Thin Limb Cell 50
Ascending Vasa Recta Endothelial Cell 50
B Cell 50
Classical Dendritic Cell 50
Connecting Tubule Cell 50
Connecting Tubule Intercalated Cell Type A 50
Connecting Tubule Principal Cell 50
Cortical Collecting Duct Intercalated Cell Type A 50
Cortical Collecting Duct Principal Cell 50
Cortical Thick Ascending Limb Cell 50
Cycling Endothelial Cell 50
Degenerative Proximal Tubule Epithelial Cell 50
Vascular Smooth Muscle Cell / Pericyte 50
Degenerative Endothelial Cell 50
Degenerative Medullary Fibroblast 50
Degenerative Ascending Thin Limb Cell 50
Degenerative Connecting Tubule Cell 50
Degenerative Podocyte 50
Degenerative Peritubular Capilary Endothelial Cell 50
Degenerative Cortical Intercalated Cell Type A 50
Degenerative Cortical Thick Ascending Limb Cell 50
Degenerative Descending Thin Limb Cell Type 3 50
Degenerative Distal Convoluted Tubule Cell 50
Degenerative Outer Medullary Collecting Duct Principal Cell 50
Degenerative Medullary Thick Ascending Limb Cell 50
Degenerative Fibroblast 50
Degenerative Inner Medullary Collecting Duct Cell 50
Cycling Mononuclear Phagocyte 48
Schwann Cell / Neural 35
Plasmacytoid Dendritic Cell 31
Cycling Myofibroblast 18
Cycling Natural Killer Cell / Natural Killer T Cell 16
Cycling Connecting Tubule Cell 12
Cycling Distal Convoluted Tubule Cell 6
In [11]:
## write the cluster-size table to "<output_folder><prefix>_cluster_sizes.csv"
df_cluster_sizes.to_csv(f"{output_folder}{outputfilename_prefix}_cluster_sizes.csv")

cluster order¶

In [12]:
## ordered categories stored under the dendrogram entry of adata.uns, with leading
## and trailing whitespace trimmed from every label
cluster_order = list(map(str.strip, adata.uns["dendrogram_" + cluster_header]['categories_ordered']))
In [13]:
## write the cluster order as a single-column csv (no index column)
pd.DataFrame(cluster_order, columns=['cluster_order']).to_csv(
    f"{output_folder}{outputfilename_prefix}_cluster_order.csv",
    index=False,
)

summary statistics of data (normal cells)¶

In [14]:
## one-row summary of the dataset: cells, genes and clusters
df_normal = pd.DataFrame(
    [[adata.n_obs, adata.n_vars, n_clusters]],
    columns = ['n_obs', 'n_vars', 'n_clusters'],
)
df_normal
Out[14]:
n_obs n_vars n_clusters
0 3566 33826 75
In [15]:
## write the summary row to "<output_folder><prefix>_summary_normal.csv" (no index column)
df_normal.to_csv(f"{output_folder}{outputfilename_prefix}_summary_normal.csv", index=False)

3. NS-Forest¶

prep¶

In [16]:
## Work on a copy: the median step keeps only the positive genes, so the
## untouched `adata` (all genes) stays available for plotting.
adata_prep = adata.copy()
In [17]:
## get medians
## NOTE(review): adata_prep is reassigned from its own previous value, so re-running
## this cell feeds the already-processed object back in; to start clean, re-run the
## `adata_prep = adata.copy()` cell first.
adata_prep = ns.pp.prep_medians(adata_prep, cluster_header)
Calculating medians...
Calculating medians (means) per cluster: 100%|██████████| 75/75 [00:07<00:00,  9.90it/s]
Saving calculated medians as adata.varm.medians_subclass.full
--- 7.585983753204346 seconds ---
median: 0.0
mean: 0.021279225
std: 0.18309134
Only positive genes selected. 5607 positive genes out of 33826 total genes
In [18]:
## get binary scores (the slowest prep step in this notebook)
## NOTE(review): adata_prep is reassigned from its own previous value, so this cell
## expects the object returned by the median step and is not meant to be run before it.
adata_prep = ns.pp.prep_binary_scores(adata_prep, cluster_header)
Calculating binary scores...
Calculating binary scores per cluster: 100%|██████████| 75/75 [01:55<00:00,  1.54s/it]
Saving calculated binary scores as adata.varm.binary_scores_subclass.full
--- 115.82460403442383 seconds ---
median: 0.0
mean: 0.08331220064118282
std: 0.24597213799454656

In [19]:
## pull the per-cluster medians table out of varm and preview it
df_medians = adata_prep.varm[f"medians_{cluster_header}"]
print(df_medians.shape)
df_medians.head(5)
(5607, 75)
Out[19]:
Adaptive / Maladaptive / Repairing Fibroblast Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell Afferent / Efferent Arteriole Endothelial Cell Ascending Thin Limb Cell Ascending Vasa Recta Endothelial Cell B Cell Classical Dendritic Cell Connecting Tubule Cell Connecting Tubule Intercalated Cell Type A ... Plasmacytoid Dendritic Cell Podocyte Proximal Tubule Epithelial Cell Segment 1 / Segment 2 Proximal Tubule Epithelial Cell Segment 3 Renin-positive Juxtaglomerular Granular Cell Schwann Cell / Neural T Cell Transitional Principal-Intercalated Cell Vascular Smooth Muscle Cell Vascular Smooth Muscle Cell / Pericyte
ENSG00000175899 0.0 0.000000 0.000000 2.1501 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.985069 0.0 0.000000 0.000000 0.0 1.36513
ENSG00000128274 0.0 0.000000 0.000000 0.0000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.00000
ENSG00000103591 0.0 0.000000 0.000000 0.0000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.00000
ENSG00000115977 0.0 1.092519 1.096968 0.0000 1.399926 0.0 0.0 0.0 1.269087 1.171837 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.595464 0.931235 0.0 0.00000
ENSG00000157426 0.0 0.000000 0.000000 0.0000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.00000

5 rows × 75 columns

In [20]:
## pull the per-cluster binary scores table out of varm and preview it
df_binary_scores = adata_prep.varm[f"binary_scores_{cluster_header}"]
print(df_binary_scores.shape)
df_binary_scores.head(5)
(5607, 75)
Out[20]:
Adaptive / Maladaptive / Repairing Fibroblast Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell Afferent / Efferent Arteriole Endothelial Cell Ascending Thin Limb Cell Ascending Vasa Recta Endothelial Cell B Cell Classical Dendritic Cell Connecting Tubule Cell Connecting Tubule Intercalated Cell Type A ... Plasmacytoid Dendritic Cell Podocyte Proximal Tubule Epithelial Cell Segment 1 / Segment 2 Proximal Tubule Epithelial Cell Segment 3 Renin-positive Juxtaglomerular Granular Cell Schwann Cell / Neural T Cell Transitional Principal-Intercalated Cell Vascular Smooth Muscle Cell Vascular Smooth Muscle Cell / Pericyte
ENSG00000175899 0.0 0.000000 0.000000 0.936407 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.894096 0.0 0.000000 0.000000 0.0 0.902851
ENSG00000128274 0.0 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.000000
ENSG00000103591 0.0 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.000000
ENSG00000115977 0.0 0.629325 0.629896 0.000000 0.674722 0.0 0.0 0.0 0.654184 0.639728 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.597169 0.615731 0.0 0.000000
ENSG00000157426 0.0 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.000000

5 rows × 75 columns

In [21]:
## save each table twice, as csv and as pkl, under "<output_folder><prefix><stem>"
for _table, _stem in ((df_medians, "_medians"), (df_binary_scores, "_binary_scores")):
    _base = output_folder + outputfilename_prefix + _stem
    _table.to_csv(_base + ".csv")
    _table.to_pickle(_base + ".pkl")

histograms of non-zero values [TO-DO: nice to have them as functions -- Beverly]¶

In [22]:
## Flatten all non-zero medians into a 1-D array (row by row) for the histogram.
## The filter is applied to the raw array instead of df[mask].stack(): that form
## relied on stack() dropping the NaNs introduced by the mask, which the new stack
## implementation (future_stack / pandas 3) no longer does. Missing values are
## still excluded, as before.
_medians_values = df_medians.to_numpy()
non_zero_medians = _medians_values[(_medians_values != 0) & pd.notna(_medians_values)]
In [23]:
## histogram of the non-zero medians (100 bins); saved as svg, then displayed
fig, ax = plt.subplots()
ax.hist(non_zero_medians, bins=100)
ax.set_title("Non-zero medians")

fig.savefig(f"{output_folder}hist_nonzero_medians_{outputfilename_suffix}.svg")
plt.show()
No description has been provided for this image
In [24]:
## Flatten all non-zero binary scores into a 1-D array (row by row) for the histogram.
## The filter is applied to the raw array instead of df[mask].stack(): that form
## relied on stack() dropping the NaNs introduced by the mask, which the new stack
## implementation (future_stack / pandas 3) no longer does. Missing values are
## still excluded, as before.
_binary_scores_values = df_binary_scores.to_numpy()
non_zero_binary_scores = _binary_scores_values[(_binary_scores_values != 0) & pd.notna(_binary_scores_values)]
In [25]:
## histogram of the non-zero binary scores (100 bins); saved as svg, then displayed
fig, ax = plt.subplots()
ax.hist(non_zero_binary_scores, bins=100)
ax.set_title("Non-zero binary scores")

fig.savefig(f"{output_folder}hist_nonzero_binary_scores_{outputfilename_suffix}.svg")
plt.show()
No description has been provided for this image

run NSForest()¶

In [26]:
## run NS-Forest on the prepared data; with save and save_supplementary enabled the
## result tables are written into output_folder, named with outputfilename_prefix
results = nsforesting.NSForest(
    adata_prep,
    cluster_header,
    save = True,
    save_supplementary = True,
    output_folder = output_folder,
    outputfilename_prefix = outputfilename_prefix,
)
Running NS-Forest version 4.1
Preparing adata...
--- 0.027405738830566406 seconds ---
Pre-selecting genes based on binary scores...
	BinaryFirst_high Threshold (mean + 2 * std): 0.5752564766302759
	Average number of genes after gene_selection in each cluster: 505.8933333333333
Saving number of genes selected per cluster as...
outputs_kidney_Lake_2023/subclass.full_gene_selection.csv
Number of clusters to evaluate: 75
1 out of 75:
	Adaptive / Maladaptive / Repairing Fibroblast
	Pre-selected 293 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000156218', 'ENSG00000077942', 'ENSG00000185070']
	  fbeta: 0.561
	  precision: 0.917
	  recall: 0.22
2 out of 75:
	Adaptive / Maladaptive / Repairing Proximal Tubule Epithelial Cell
	Pre-selected 786 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000135220', 'ENSG00000170579']
	  fbeta: 0.634
	  precision: 0.783
	  recall: 0.36
3 out of 75:
	Adaptive / Maladaptive / Repairing Thick Ascending Limb Cell
	Pre-selected 947 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000129682', 'ENSG00000115221', 'ENSG00000129151']
	  fbeta: 0.616
	  precision: 0.773
	  recall: 0.34
4 out of 75:
	Afferent / Efferent Arteriole Endothelial Cell
	Pre-selected 397 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000135919', 'ENSG00000131477']
	  fbeta: 0.761
	  precision: 0.955
	  recall: 0.42
5 out of 75:
	Ascending Thin Limb Cell
	Pre-selected 820 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000184374', 'ENSG00000275395']
	  fbeta: 0.616
	  precision: 0.773
	  recall: 0.34
6 out of 75:
	Ascending Vasa Recta Endothelial Cell
	Pre-selected 299 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000148488', 'ENSG00000283632']
	  fbeta: 0.731
	  precision: 0.95
	  recall: 0.38
7 out of 75:
	B Cell
	Pre-selected 44 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000156738']
	  fbeta: 0.773
	  precision: 0.833
	  recall: 0.6
8 out of 75:
	Classical Dendritic Cell
	Pre-selected 353 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000131203']
	  fbeta: 0.857
	  precision: 0.9
	  recall: 0.72
9 out of 75:
	Connecting Tubule Cell
	Pre-selected 363 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000251504', 'ENSG00000165685', 'ENSG00000226674']
	  fbeta: 0.448
	  precision: 0.571
	  recall: 0.24
10 out of 75:
	Connecting Tubule Intercalated Cell Type A
	Pre-selected 644 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000112530', 'ENSG00000145147']
	  fbeta: 0.325
	  precision: 0.385
	  recall: 0.2
11 out of 75:
	Connecting Tubule Principal Cell
	Pre-selected 653 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000159167', 'ENSG00000104327', 'ENSG00000204323']
	  fbeta: 0.556
	  precision: 0.737
	  recall: 0.28
12 out of 75:
	Cortical Collecting Duct Intercalated Cell Type A
	Pre-selected 376 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000278961', 'ENSG00000185274', 'ENSG00000145147']
	  fbeta: 0.591
	  precision: 0.867
	  recall: 0.26
13 out of 75:
	Cortical Collecting Duct Principal Cell
	Pre-selected 971 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000182752', 'ENSG00000184672']
	  fbeta: 0.563
	  precision: 0.696
	  recall: 0.32
14 out of 75:
	Cortical Thick Ascending Limb Cell
	Pre-selected 425 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000113946', 'ENSG00000036672']
	  fbeta: 0.534
	  precision: 0.594
	  recall: 0.38
15 out of 75:
	Cycling Connecting Tubule Cell
	Pre-selected 1128 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000102870', 'ENSG00000158805']
	  fbeta: 0.341
	  precision: 0.375
	  recall: 0.25
16 out of 75:
	Cycling Distal Convoluted Tubule Cell
	Pre-selected 1774 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000058404', 'ENSG00000066279']
	  fbeta: 0.714
	  precision: 1.0
	  recall: 0.333
17 out of 75:
	Cycling Endothelial Cell
	Pre-selected 290 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000178878', 'ENSG00000139734']
	  fbeta: 0.57
	  precision: 0.812
	  recall: 0.26
18 out of 75:
	Cycling Mononuclear Phagocyte
	Pre-selected 189 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000077420', 'ENSG00000101639', 'ENSG00000185811']
	  fbeta: 0.375
	  precision: 0.75
	  recall: 0.125
19 out of 75:
	Cycling Myofibroblast
	Pre-selected 768 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000143476', 'ENSG00000152402']
	  fbeta: 0.761
	  precision: 1.0
	  recall: 0.389
20 out of 75:
	Cycling Natural Killer Cell / Natural Killer T Cell
	Pre-selected 723 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000166803', 'ENSG00000183918']
	  fbeta: 0.893
	  precision: 1.0
	  recall: 0.625
21 out of 75:
	Cycling Proximal Tubule Epithelial Cell
	Pre-selected 275 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000165304', 'ENSG00000132780']
	  fbeta: 0.354
	  precision: 0.364
	  recall: 0.32
22 out of 75:
	Degenerative Ascending Thin Limb Cell
	Pre-selected 646 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000197249', 'ENSG00000140600']
	  fbeta: 0.593
	  precision: 0.639
	  recall: 0.46
23 out of 75:
	Degenerative Connecting Tubule Cell
	Pre-selected 644 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000162896', 'ENSG00000145536', 'ENSG00000104327']
	  fbeta: 0.5
	  precision: 0.9
	  recall: 0.18
24 out of 75:
	Degenerative Cortical Intercalated Cell Type A
	Pre-selected 908 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000130203', 'ENSG00000151418']
	  fbeta: 0.718
	  precision: 0.806
	  recall: 0.5
25 out of 75:
	Degenerative Cortical Thick Ascending Limb Cell
	Pre-selected 150 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000198431', 'ENSG00000124107']
	  fbeta: 0.636
	  precision: 0.882
	  recall: 0.3
26 out of 75:
	Degenerative Descending Thin Limb Cell Type 3
	Pre-selected 1607 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000133475', 'ENSG00000115641']
	  fbeta: 0.787
	  precision: 0.875
	  recall: 0.56
27 out of 75:
	Degenerative Distal Convoluted Tubule Cell
	Pre-selected 995 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000121769', 'ENSG00000178343']
	  fbeta: 0.742
	  precision: 0.818
	  recall: 0.54
28 out of 75:
	Degenerative Endothelial Cell
	Pre-selected 54 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000142089', 'ENSG00000184831']
	  fbeta: 0.426
	  precision: 0.727
	  recall: 0.16
29 out of 75:
	Degenerative Fibroblast
	Pre-selected 216 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000139329', 'ENSG00000142173']
	  fbeta: 0.812
	  precision: 0.962
	  recall: 0.5
30 out of 75:
	Degenerative Inner Medullary Collecting Duct Cell
	Pre-selected 1131 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000095303', 'ENSG00000171303', 'ENSG00000258551']
	  fbeta: 0.567
	  precision: 0.68
	  recall: 0.34
31 out of 75:
	Degenerative Medullary Fibroblast
	Pre-selected 13 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000085662', 'ENSG00000171345']
	  fbeta: 0.241
	  precision: 0.222
	  recall: 0.36
32 out of 75:
	Degenerative Medullary Thick Ascending Limb Cell
	Pre-selected 127 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000169344', 'ENSG00000119715', 'ENSG00000074803']
	  fbeta: 0.413
	  precision: 0.422
	  recall: 0.38
33 out of 75:
	Degenerative Outer Medullary Collecting Duct Principal Cell
	Pre-selected 769 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000085117', 'ENSG00000166828', 'ENSG00000085563', 'ENSG00000165272']
	  fbeta: 0.508
	  precision: 0.706
	  recall: 0.24
34 out of 75:
	Degenerative Peritubular Capilary Endothelial Cell
	Pre-selected 175 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000102755', 'ENSG00000154217', 'ENSG00000142798']
	  fbeta: 0.32
	  precision: 0.32
	  recall: 0.32
35 out of 75:
	Degenerative Podocyte
	Pre-selected 209 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000166033', 'ENSG00000107742']
	  fbeta: 0.873
	  precision: 1.0
	  recall: 0.58
36 out of 75:
	Degenerative Proximal Tubule Epithelial Cell
	Pre-selected 240 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000259579', 'ENSG00000164626', 'ENSG00000227258']
	  fbeta: 0.688
	  precision: 0.864
	  recall: 0.38
37 out of 75:
	Degenerative Vascular Smooth Muscle Cell
	Pre-selected 536 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000198467', 'ENSG00000198523']
	  fbeta: 0.86
	  precision: 0.941
	  recall: 0.64
38 out of 75:
	Descending Thin Limb Cell Type 1
	Pre-selected 343 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000197301', 'ENSG00000150471']
	  fbeta: 0.556
	  precision: 1.0
	  recall: 0.2
39 out of 75:
	Descending Thin Limb Cell Type 2
	Pre-selected 560 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000235139', 'ENSG00000188883']
	  fbeta: 0.648
	  precision: 0.75
	  recall: 0.42
40 out of 75:
	Descending Thin Limb Cell Type 3
	Pre-selected 745 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000249776', 'ENSG00000145721']
	  fbeta: 0.546
	  precision: 0.613
	  recall: 0.38
41 out of 75:
	Descending Vasa Recta Endothelial Cell
	Pre-selected 519 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000141469', 'ENSG00000116833']
	  fbeta: 0.636
	  precision: 0.933
	  recall: 0.28
42 out of 75:
	Distal Convoluted Tubule Cell Type 1
	Pre-selected 889 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000237422', 'ENSG00000119121']
	  fbeta: 0.675
	  precision: 0.895
	  recall: 0.34
43 out of 75:
	Distal Convoluted Tubule Cell Type 2
	Pre-selected 559 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000165973', 'ENSG00000070915', 'ENSG00000182168']
	  fbeta: 0.549
	  precision: 0.606
	  recall: 0.4
44 out of 75:
	Fibroblast
	Pre-selected 170 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000140092', 'ENSG00000112936', 'ENSG00000116962']
	  fbeta: 0.574
	  precision: 0.778
	  recall: 0.28
45 out of 75:
	Glomerular Capillary Endothelial Cell
	Pre-selected 251 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000167941', 'ENSG00000145708']
	  fbeta: 0.746
	  precision: 0.952
	  recall: 0.4
46 out of 75:
	Inner Medullary Collecting Duct Cell
	Pre-selected 1433 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000228624', 'ENSG00000249853', 'ENSG00000223561']
	  fbeta: 0.682
	  precision: 0.808
	  recall: 0.42
47 out of 75:
	Intercalated Cell Type B
	Pre-selected 362 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000091137', 'ENSG00000188175']
	  fbeta: 0.775
	  precision: 0.957
	  recall: 0.44
48 out of 75:
	Lymphatic Endothelial Cell
	Pre-selected 243 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000138722']
	  fbeta: 0.891
	  precision: 0.947
	  recall: 0.72
49 out of 75:
	M2 Macrophage
	Pre-selected 78 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000260314', 'ENSG00000137491']
	  fbeta: 0.714
	  precision: 0.947
	  recall: 0.36
50 out of 75:
	Macula Densa Cell
	Pre-selected 672 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000139220', 'ENSG00000089250']
	  fbeta: 0.656
	  precision: 0.889
	  recall: 0.32
51 out of 75:
	Mast Cell
	Pre-selected 41 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000185052']
	  fbeta: 0.891
	  precision: 1.0
	  recall: 0.62
52 out of 75:
	Medullary Fibroblast
	Pre-selected 344 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000182256', 'ENSG00000259275']
	  fbeta: 0.704
	  precision: 0.87
	  recall: 0.4
53 out of 75:
	Medullary Thick Ascending Limb Cell
	Pre-selected 1032 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000159261', 'ENSG00000066230', 'ENSG00000117707']
	  fbeta: 0.634
	  precision: 0.81
	  recall: 0.34
54 out of 75:
	Mesangial Cell
	Pre-selected 311 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000188517', 'ENSG00000133816']
	  fbeta: 0.711
	  precision: 0.771
	  recall: 0.54
55 out of 75:
	Monocyte-derived Cell
	Pre-selected 77 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000137462', 'ENSG00000136250']
	  fbeta: 0.369
	  precision: 0.5
	  recall: 0.18
56 out of 75:
	Myofibroblast
	Pre-selected 371 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000198542', 'ENSG00000249669']
	  fbeta: 0.636
	  precision: 0.933
	  recall: 0.28
57 out of 75:
	Natural Killer Cell / Natural Killer T Cell
	Pre-selected 177 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000105374', 'ENSG00000275302']
	  fbeta: 0.78
	  precision: 0.853
	  recall: 0.58
58 out of 75:
	Neutrophil
	Pre-selected 54 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000143546']
	  fbeta: 0.817
	  precision: 0.868
	  recall: 0.66
59 out of 75:
	Non-classical Monocyte
	Pre-selected 128 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000085265', 'ENSG00000028277']
	  fbeta: 0.634
	  precision: 0.81
	  recall: 0.34
60 out of 75:
	Outer Medullary Collecting Duct Intercalated Cell Type A
	Pre-selected 1984 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000253675', 'ENSG00000258881']
	  fbeta: 0.776
	  precision: 0.871
	  recall: 0.54
61 out of 75:
	Outer Medullary Collecting Duct Principal Cell
	Pre-selected 1300 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000254789', 'ENSG00000267659']
	  fbeta: 0.652
	  precision: 0.818
	  recall: 0.36
62 out of 75:
	Papillary Tip Epithelial Cell
	Pre-selected 642 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000174226', 'ENSG00000142973']
	  fbeta: 0.784
	  precision: 1.0
	  recall: 0.42
63 out of 75:
	Parietal Epithelial Cell
	Pre-selected 158 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000112414', 'ENSG00000000971']
	  fbeta: 0.637
	  precision: 1.0
	  recall: 0.26
64 out of 75:
	Peritubular Capilary Endothelial Cell 
	Pre-selected 112 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000163687', 'ENSG00000102755']
	  fbeta: 0.584
	  precision: 0.692
	  recall: 0.36
65 out of 75:
	Plasma Cell
	Pre-selected 86 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000167077', 'ENSG00000183508']
	  fbeta: 0.784
	  precision: 1.0
	  recall: 0.42
66 out of 75:
	Plasmacytoid Dendritic Cell
	Pre-selected 199 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000267337']
	  fbeta: 0.901
	  precision: 1.0
	  recall: 0.645
67 out of 75:
	Podocyte
	Pre-selected 746 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000139304', 'ENSG00000178038']
	  fbeta: 0.902
	  precision: 0.972
	  recall: 0.7
68 out of 75:
	Proximal Tubule Epithelial Cell Segment 1 / Segment 2
	Pre-selected 102 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000171759', 'ENSG00000171766']
	  fbeta: 0.702
	  precision: 1.0
	  recall: 0.32
69 out of 75:
	Proximal Tubule Epithelial Cell Segment 3
	Pre-selected 245 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000158865', 'ENSG00000156222']
	  fbeta: 0.682
	  precision: 1.0
	  recall: 0.3
70 out of 75:
	Renin-positive Juxtaglomerular Granular Cell
	Pre-selected 173 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000143839']
	  fbeta: 0.954
	  precision: 0.943
	  recall: 1.0
71 out of 75:
	Schwann Cell / Neural
	Pre-selected 94 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000179915']
	  fbeta: 0.952
	  precision: 1.0
	  recall: 0.8
72 out of 75:
	T Cell
	Pre-selected 37 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000168685']
	  fbeta: 0.647
	  precision: 0.733
	  recall: 0.44
73 out of 75:
	Transitional Principal-Intercalated Cell
	Pre-selected 1436 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000160951', 'ENSG00000130222', 'ENSG00000259120']
	  fbeta: 0.714
	  precision: 0.947
	  recall: 0.36
74 out of 75:
	Vascular Smooth Muscle Cell
	Pre-selected 242 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000122367', 'ENSG00000156113']
	  fbeta: 0.767
	  precision: 0.92
	  recall: 0.46
75 out of 75:
	Vascular Smooth Muscle Cell / Pericyte
	Pre-selected 89 genes to feed into Random Forest.
	  NSForest-selected markers: ['ENSG00000113721', 'ENSG00000131711']
	  fbeta: 0.5
	  precision: 0.6
	  recall: 0.3
Saving supplementary table as...
outputs_kidney_Lake_2023/subclass.full_supplementary.csv
Saving markers table as...
outputs_kidney_Lake_2023/subclass.full_markers.csv
using median
Calculating medians (means) per cluster: 100%|██████████| 75/75 [00:01<00:00, 59.58it/s]
Saving supplementary table as...
outputs_kidney_Lake_2023/subclass.full_markers_onTarget_supp.csv
Saving supplementary table as...
outputs_kidney_Lake_2023/subclass.full_markers_onTarget.csv
Saving final results table as...
outputs_kidney_Lake_2023/subclass.full_results.csv
Saving final results table as...
outputs_kidney_Lake_2023/subclass.full_results.pkl
--- 345.1476306915283 seconds ---
In [27]:
## display the final NS-Forest results table (one row per cluster)
results
Out[27]:
software_version cluster_header clusterName clusterSize f_score precision recall TN FP FN TP marker_count NSForest_markers binary_genes onTarget
0 4.1 subclass.full Adaptive / Maladaptive / Repairing Fibroblast 50 0.561224 0.916667 0.220000 3515 1 39 11 3 [ENSG00000156218, ENSG00000077942, ENSG0000018... [ENSG00000091986, ENSG00000156218, ENSG0000007... 0.427534
1 4.1 subclass.full Adaptive / Maladaptive / Repairing Proximal Tu... 50 0.633803 0.782609 0.360000 3511 5 32 18 2 [ENSG00000135220, ENSG00000170579] [ENSG00000135220, ENSG00000146592, ENSG0000017... 0.706288
2 4.1 subclass.full Adaptive / Maladaptive / Repairing Thick Ascen... 50 0.615942 0.772727 0.340000 3511 5 33 17 3 [ENSG00000129682, ENSG00000115221, ENSG0000012... [ENSG00000152056, ENSG00000129682, ENSG0000011... 0.273096
3 4.1 subclass.full Afferent / Efferent Arteriole Endothelial Cell 50 0.760870 0.954545 0.420000 3515 1 29 21 2 [ENSG00000135919, ENSG00000131477] [ENSG00000135919, ENSG00000090006, ENSG0000016... 0.778972
4 4.1 subclass.full Ascending Thin Limb Cell 50 0.615942 0.772727 0.340000 3511 5 33 17 2 [ENSG00000184374, ENSG00000275395] [ENSG00000184374, ENSG00000064787, ENSG0000008... 0.772847
5 4.1 subclass.full Ascending Vasa Recta Endothelial Cell 50 0.730769 0.950000 0.380000 3515 1 31 19 2 [ENSG00000148488, ENSG00000283632] [ENSG00000249797, ENSG00000148488, ENSG0000015... 1.000000
6 4.1 subclass.full B Cell 50 0.773196 0.833333 0.600000 3510 6 20 30 1 [ENSG00000156738] [ENSG00000156738, ENSG00000153064, ENSG0000013... 1.000000
7 4.1 subclass.full Classical Dendritic Cell 50 0.857143 0.900000 0.720000 3512 4 14 36 1 [ENSG00000131203] [ENSG00000131203, ENSG00000140090, ENSG0000023... 1.000000
8 4.1 subclass.full Connecting Tubule Cell 50 0.447761 0.571429 0.240000 3507 9 38 12 3 [ENSG00000251504, ENSG00000165685, ENSG0000022... [ENSG00000251504, ENSG00000104327, ENSG0000016... 0.118911
9 4.1 subclass.full Connecting Tubule Intercalated Cell Type A 50 0.324675 0.384615 0.200000 3500 16 40 10 2 [ENSG00000112530, ENSG00000145147] [ENSG00000185274, ENSG00000154678, ENSG0000011... 0.240039
10 4.1 subclass.full Connecting Tubule Principal Cell 50 0.555556 0.736842 0.280000 3511 5 36 14 3 [ENSG00000159167, ENSG00000104327, ENSG0000020... [ENSG00000006327, ENSG00000159167, ENSG0000010... 0.408436
11 4.1 subclass.full Cortical Collecting Duct Intercalated Cell Type A 50 0.590909 0.866667 0.260000 3514 2 37 13 3 [ENSG00000278961, ENSG00000185274, ENSG0000014... [ENSG00000278961, ENSG00000144227, ENSG0000018... 0.373059
12 4.1 subclass.full Cortical Collecting Duct Principal Cell 50 0.563380 0.695652 0.320000 3509 7 34 16 2 [ENSG00000182752, ENSG00000184672] [ENSG00000169071, ENSG00000182752, ENSG0000025... 0.319824
13 4.1 subclass.full Cortical Thick Ascending Limb Cell 50 0.533708 0.593750 0.380000 3503 13 31 19 2 [ENSG00000113946, ENSG00000036672] [ENSG00000169347, ENSG00000113946, ENSG0000003... 1.000000
14 4.1 subclass.full Cycling Connecting Tubule Cell 12 0.340909 0.375000 0.250000 3549 5 9 3 2 [ENSG00000102870, ENSG00000158805] [ENSG00000102870, ENSG00000138587, ENSG0000015... 1.000000
15 4.1 subclass.full Cycling Distal Convoluted Tubule Cell 6 0.714286 1.000000 0.333333 3560 0 4 2 2 [ENSG00000058404, ENSG00000066279] [ENSG00000105928, ENSG00000058404, ENSG0000012... 1.000000
16 4.1 subclass.full Cycling Endothelial Cell 50 0.570175 0.812500 0.260000 3513 3 37 13 2 [ENSG00000178878, ENSG00000139734] [ENSG00000106462, ENSG00000178878, ENSG0000011... 0.711106
17 4.1 subclass.full Cycling Mononuclear Phagocyte 48 0.375000 0.750000 0.125000 3516 2 42 6 3 [ENSG00000077420, ENSG00000101639, ENSG0000018... [ENSG00000170017, ENSG00000077420, ENSG0000014... 0.242716
18 4.1 subclass.full Cycling Myofibroblast 18 0.760870 1.000000 0.388889 3548 0 11 7 2 [ENSG00000143476, ENSG00000152402] [ENSG00000136492, ENSG00000123219, ENSG0000011... 0.779364
19 4.1 subclass.full Cycling Natural Killer Cell / Natural Killer T... 16 0.892857 1.000000 0.625000 3550 0 6 10 2 [ENSG00000166803, ENSG00000183918] [ENSG00000166803, ENSG00000164104, ENSG0000018... 1.000000
20 4.1 subclass.full Cycling Proximal Tubule Epithelial Cell 50 0.353982 0.363636 0.320000 3488 28 34 16 2 [ENSG00000165304, ENSG00000132780] [ENSG00000165304, ENSG00000132780, ENSG0000012... 0.191920
21 4.1 subclass.full Degenerative Ascending Thin Limb Cell 50 0.592784 0.638889 0.460000 3503 13 27 23 2 [ENSG00000197249, ENSG00000140600] [ENSG00000197249, ENSG00000275395, ENSG0000014... 0.437558
22 4.1 subclass.full Degenerative Connecting Tubule Cell 50 0.500000 0.900000 0.180000 3515 1 41 9 3 [ENSG00000162896, ENSG00000145536, ENSG0000010... [ENSG00000162896, ENSG00000145536, ENSG0000010... 0.285822
23 4.1 subclass.full Degenerative Cortical Intercalated Cell Type A 50 0.718391 0.806452 0.500000 3510 6 25 25 2 [ENSG00000130203, ENSG00000151418] [ENSG00000213185, ENSG00000152931, ENSG0000013... 0.437250
24 4.1 subclass.full Degenerative Cortical Thick Ascending Limb Cell 50 0.635593 0.882353 0.300000 3514 2 35 15 2 [ENSG00000198431, ENSG00000124107] [ENSG00000203907, ENSG00000135931, ENSG0000019... 0.235051
25 4.1 subclass.full Degenerative Descending Thin Limb Cell Type 3 50 0.786517 0.875000 0.560000 3512 4 22 28 2 [ENSG00000133475, ENSG00000115641] [ENSG00000019186, ENSG00000187957, ENSG0000013... 0.940297
26 4.1 subclass.full Degenerative Distal Convoluted Tubule Cell 50 0.741758 0.818182 0.540000 3510 6 23 27 2 [ENSG00000121769, ENSG00000178343] [ENSG00000166426, ENSG00000121769, ENSG0000018... 1.000000
27 4.1 subclass.full Degenerative Endothelial Cell 50 0.425532 0.727273 0.160000 3513 3 42 8 2 [ENSG00000142089, ENSG00000184831] [ENSG00000142089, ENSG00000130300, ENSG0000018... 0.212317
28 4.1 subclass.full Degenerative Fibroblast 50 0.811688 0.961538 0.500000 3515 1 25 25 2 [ENSG00000139329, ENSG00000142173] [ENSG00000139329, ENSG00000159403, ENSG0000010... 0.930817
29 4.1 subclass.full Degenerative Inner Medullary Collecting Duct Cell 50 0.566667 0.680000 0.340000 3508 8 33 17 3 [ENSG00000095303, ENSG00000171303, ENSG0000025... [ENSG00000095303, ENSG00000171303, ENSG0000022... 0.629699
30 4.1 subclass.full Degenerative Medullary Fibroblast 50 0.240642 0.222222 0.360000 3453 63 32 18 2 [ENSG00000085662, ENSG00000171345] [ENSG00000109846, ENSG00000085662, ENSG0000012... 0.118473
31 4.1 subclass.full Degenerative Medullary Thick Ascending Limb Cell 50 0.413043 0.422222 0.380000 3490 26 31 19 3 [ENSG00000169344, ENSG00000119715, ENSG0000007... [ENSG00000169344, ENSG00000119715, ENSG0000007... 0.163110
32 4.1 subclass.full Degenerative Outer Medullary Collecting Duct P... 50 0.508475 0.705882 0.240000 3511 5 38 12 4 [ENSG00000085117, ENSG00000166828, ENSG0000008... [ENSG00000160951, ENSG00000159167, ENSG0000008... 0.229297
33 4.1 subclass.full Degenerative Peritubular Capilary Endothelial ... 50 0.320000 0.320000 0.320000 3482 34 34 16 3 [ENSG00000102755, ENSG00000154217, ENSG0000014... [ENSG00000102755, ENSG00000127329, ENSG0000026... 0.111280
34 4.1 subclass.full Degenerative Podocyte 50 0.873494 1.000000 0.580000 3516 0 21 29 2 [ENSG00000166033, ENSG00000107742] [ENSG00000159713, ENSG00000166033, ENSG0000010... 0.685713
35 4.1 subclass.full Degenerative Proximal Tubule Epithelial Cell 50 0.688406 0.863636 0.380000 3513 3 31 19 3 [ENSG00000259579, ENSG00000164626, ENSG0000022... [ENSG00000150275, ENSG00000259579, ENSG0000025... 1.000000
36 4.1 subclass.full Degenerative Vascular Smooth Muscle Cell 50 0.860215 0.941176 0.640000 3514 2 18 32 2 [ENSG00000198467, ENSG00000198523] [ENSG00000198467, ENSG00000198523, ENSG0000017... 1.000000
37 4.1 subclass.full Descending Thin Limb Cell Type 1 50 0.555556 1.000000 0.200000 3516 0 40 10 2 [ENSG00000197301, ENSG00000150471] [ENSG00000197301, ENSG00000228412, ENSG0000011... 0.645970
38 4.1 subclass.full Descending Thin Limb Cell Type 2 50 0.648148 0.750000 0.420000 3509 7 29 21 2 [ENSG00000235139, ENSG00000188883] [ENSG00000156687, ENSG00000235139, ENSG0000022... 0.737245
39 4.1 subclass.full Descending Thin Limb Cell Type 3 50 0.545977 0.612903 0.380000 3504 12 31 19 2 [ENSG00000249776, ENSG00000145721] [ENSG00000249776, ENSG00000233611, ENSG0000014... 1.000000
40 4.1 subclass.full Descending Vasa Recta Endothelial Cell 50 0.636364 0.933333 0.280000 3515 1 36 14 2 [ENSG00000141469, ENSG00000116833] [ENSG00000136960, ENSG00000141469, ENSG0000011... 1.000000
41 4.1 subclass.full Distal Convoluted Tubule Cell Type 1 50 0.674603 0.894737 0.340000 3514 2 33 17 2 [ENSG00000237422, ENSG00000119121] [ENSG00000237422, ENSG00000140470, ENSG0000024... 0.694848
42 4.1 subclass.full Distal Convoluted Tubule Cell Type 2 50 0.549451 0.606061 0.400000 3503 13 30 20 3 [ENSG00000165973, ENSG00000070915, ENSG0000018... [ENSG00000165973, ENSG00000146021, ENSG0000011... 0.262543
43 4.1 subclass.full Fibroblast 50 0.573770 0.777778 0.280000 3512 4 36 14 3 [ENSG00000140092, ENSG00000112936, ENSG0000011... [ENSG00000262655, ENSG00000154262, ENSG0000014... 0.330656
44 4.1 subclass.full Glomerular Capillary Endothelial Cell 50 0.746269 0.952381 0.400000 3515 1 30 20 2 [ENSG00000167941, ENSG00000145708] [ENSG00000167941, ENSG00000013016, ENSG0000014... 1.000000
45 4.1 subclass.full Inner Medullary Collecting Duct Cell 50 0.681818 0.807692 0.420000 3511 5 29 21 3 [ENSG00000228624, ENSG00000249853, ENSG0000022... [ENSG00000228624, ENSG00000249853, ENSG0000013... 0.700192
46 4.1 subclass.full Intercalated Cell Type B 50 0.774648 0.956522 0.440000 3515 1 28 22 2 [ENSG00000091137, ENSG00000188175] [ENSG00000091137, ENSG00000027644, ENSG0000012... 0.909313
47 4.1 subclass.full Lymphatic Endothelial Cell 50 0.891089 0.947368 0.720000 3514 2 14 36 1 [ENSG00000138722] [ENSG00000138722, ENSG00000184058, ENSG0000020... 1.000000
48 4.1 subclass.full M2 Macrophage 50 0.714286 0.947368 0.360000 3515 1 32 18 2 [ENSG00000260314, ENSG00000137491] [ENSG00000177575, ENSG00000260314, ENSG0000013... 1.000000
49 4.1 subclass.full Macula Densa Cell 50 0.655738 0.888889 0.320000 3514 2 34 16 2 [ENSG00000139220, ENSG00000089250] [ENSG00000116183, ENSG00000091128, ENSG0000013... 0.919283
50 4.1 subclass.full Mast Cell 50 0.890805 1.000000 0.620000 3516 0 19 31 1 [ENSG00000185052] [ENSG00000163751, ENSG00000197253, ENSG0000010... 0.799457
51 4.1 subclass.full Medullary Fibroblast 50 0.704225 0.869565 0.400000 3513 3 30 20 2 [ENSG00000182256, ENSG00000259275] [ENSG00000182256, ENSG00000079931, ENSG0000006... 1.000000
52 4.1 subclass.full Medullary Thick Ascending Limb Cell 50 0.634328 0.809524 0.340000 3512 4 33 17 3 [ENSG00000159261, ENSG00000066230, ENSG0000011... [ENSG00000159261, ENSG00000066230, ENSG0000028... 1.000000
53 4.1 subclass.full Mesangial Cell 50 0.710526 0.771429 0.540000 3508 8 23 27 2 [ENSG00000188517, ENSG00000133816] [ENSG00000144891, ENSG00000154864, ENSG0000013... 0.532527
54 4.1 subclass.full Monocyte-derived Cell 50 0.368852 0.500000 0.180000 3507 9 41 9 2 [ENSG00000137462, ENSG00000136250] [ENSG00000137462, ENSG00000119900, ENSG0000010... 0.692946
55 4.1 subclass.full Myofibroblast 50 0.636364 0.933333 0.280000 3515 1 36 14 2 [ENSG00000198542, ENSG00000249669] [ENSG00000198542, ENSG00000152402, ENSG0000010... 0.669407
56 4.1 subclass.full Natural Killer Cell / Natural Killer T Cell 50 0.779570 0.852941 0.580000 3511 5 21 29 2 [ENSG00000105374, ENSG00000275302] [ENSG00000180644, ENSG00000105374, ENSG0000011... 1.000000
57 4.1 subclass.full Neutrophil 50 0.816832 0.868421 0.660000 3511 5 17 33 1 [ENSG00000143546] [ENSG00000143546, ENSG00000163220, ENSG0000005... 1.000000
58 4.1 subclass.full Non-classical Monocyte 50 0.634328 0.809524 0.340000 3512 4 33 17 2 [ENSG00000085265, ENSG00000028277] [ENSG00000085265, ENSG00000204482, ENSG0000001... 1.000000
59 4.1 subclass.full Outer Medullary Collecting Duct Intercalated C... 50 0.775862 0.870968 0.540000 3512 4 23 27 2 [ENSG00000253675, ENSG00000258881] [ENSG00000253675, ENSG00000106302, ENSG0000023... 1.000000
60 4.1 subclass.full Outer Medullary Collecting Duct Principal Cell 50 0.652174 0.818182 0.360000 3512 4 32 18 2 [ENSG00000254789, ENSG00000267659] [ENSG00000254789, ENSG00000254695, ENSG0000016... 0.710974
61 4.1 subclass.full Papillary Tip Epithelial Cell 50 0.783582 1.000000 0.420000 3516 0 29 21 2 [ENSG00000174226, ENSG00000142973] [ENSG00000171401, ENSG00000174226, ENSG0000014... 1.000000
62 4.1 subclass.full Parietal Epithelial Cell 50 0.637255 1.000000 0.260000 3516 0 37 13 2 [ENSG00000112414, ENSG00000000971] [ENSG00000162692, ENSG00000112414, ENSG0000000... 0.868456
63 4.1 subclass.full Peritubular Capilary Endothelial Cell 50 0.584416 0.692308 0.360000 3508 8 32 18 2 [ENSG00000163687, ENSG00000102755] [ENSG00000163687, ENSG00000168497, ENSG0000012... 0.592903
64 4.1 subclass.full Plasma Cell 50 0.783582 1.000000 0.420000 3516 0 29 21 2 [ENSG00000167077, ENSG00000183508] [ENSG00000170476, ENSG00000167077, ENSG0000018... 1.000000
65 4.1 subclass.full Plasmacytoid Dendritic Cell 31 0.900901 1.000000 0.645161 3535 0 11 20 1 [ENSG00000267337] [ENSG00000267337, ENSG00000111249, ENSG0000019... 1.000000
66 4.1 subclass.full Podocyte 50 0.902062 0.972222 0.700000 3515 1 15 35 2 [ENSG00000139304, ENSG00000178038] [ENSG00000139304, ENSG00000155816, ENSG0000014... 1.000000
67 4.1 subclass.full Proximal Tubule Epithelial Cell Segment 1 / Se... 50 0.701754 1.000000 0.320000 3516 0 34 16 2 [ENSG00000171759, ENSG00000171766] [ENSG00000149452, ENSG00000250799, ENSG0000017... 0.680535
68 4.1 subclass.full Proximal Tubule Epithelial Cell Segment 3 50 0.681818 1.000000 0.300000 3516 0 35 15 2 [ENSG00000158865, ENSG00000156222] [ENSG00000158865, ENSG00000154025, ENSG0000022... 0.957749
69 4.1 subclass.full Renin-positive Juxtaglomerular Granular Cell 50 0.954198 0.943396 1.000000 3513 3 0 50 1 [ENSG00000143839] [ENSG00000143839, ENSG00000152208, ENSG0000010... 1.000000
70 4.1 subclass.full Schwann Cell / Neural 35 0.952381 1.000000 0.800000 3531 0 7 28 1 [ENSG00000179915] [ENSG00000179915, ENSG00000175161, ENSG0000007... 1.000000
71 4.1 subclass.full T Cell 50 0.647059 0.733333 0.440000 3508 8 28 22 1 [ENSG00000168685] [ENSG00000168685, ENSG00000153283, ENSG0000017... 1.000000
72 4.1 subclass.full Transitional Principal-Intercalated Cell 50 0.714286 0.947368 0.360000 3515 1 32 18 3 [ENSG00000160951, ENSG00000130222, ENSG0000025... [ENSG00000160951, ENSG00000130222, ENSG0000025... 0.311119
73 4.1 subclass.full Vascular Smooth Muscle Cell 50 0.766667 0.920000 0.460000 3514 2 27 23 2 [ENSG00000122367, ENSG00000156113] [ENSG00000122367, ENSG00000156113, ENSG0000016... 1.000000
74 4.1 subclass.full Vascular Smooth Muscle Cell / Pericyte 50 0.500000 0.600000 0.300000 3506 10 35 15 2 [ENSG00000113721, ENSG00000131711] [ENSG00000138031, ENSG00000113721, ENSG0000013... 0.277395

4. Plotting¶

Load NS-Forest results (optional: copy the setup and load the saved pkl)¶

In [28]:
# ## OPTIONAL: this whole cell is commented out. Uncomment it to re-create the
# ## setup variables and reload previously saved NS-Forest results from the
# ## pkl file instead of using the `results` object already in memory.
# ## NOTE(review): lines tagged `#<---` look like the values meant to be edited per dataset -- confirm.
# ## set up
# organ = "kidney" #<---
# author = "Lake" #<---
# year = "2023" #<---
# output_folder = "outputs_" + organ + "_" + author + "_" + year + "/" #e.g., "outputs_kidney_Lake_2023/"

# cluster_header = "subclass.full" #<---
# outputfilename_suffix = cluster_header
# outputfilename_prefix = cluster_header 

# ## load NS-Forest results
# ## (pickle files can execute code when loaded: only read a pkl you produced yourself)
# results = pd.read_pickle(output_folder + cluster_header + "_results.pkl")
In [29]:
## set results to plot
# Work on an independent copy: a later cell adds a 'gene_names' column to
# results_to_plot, and a plain alias would silently add it to `results` too.
results_to_plot = results.copy()

boxplots¶

In [30]:
# Box plot of the `f_score` column of the results table, saved as html
ns.pl.boxplot(
    results_to_plot,
    "f_score",
    save="html",
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_boxplot_f_score.html
In [31]:
# Box plot of the `precision` column of the results table, saved as html
ns.pl.boxplot(
    results_to_plot,
    "precision",
    save="html",
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_boxplot_precision.html
In [32]:
# Box plot of the `recall` column of the results table, saved as html
ns.pl.boxplot(
    results_to_plot,
    "recall",
    save="html",
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_boxplot_recall.html
In [33]:
# Box plot of the `onTarget` column of the results table, saved as html
ns.pl.boxplot(
    results_to_plot,
    "onTarget",
    save="html",
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_boxplot_onTarget.html

scatter plots w.r.t. cluster size¶

In [34]:
# Scatter plot of `f_score` with respect to cluster size.
# Reads results_to_plot (not `results`) so this cell follows the same
# "set results to plot" selection as the boxplot cells.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "f_score",
    save=True,
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_scatter_f_score.html
In [35]:
# Scatter plot of `precision` with respect to cluster size.
# Reads results_to_plot (not `results`) so this cell follows the same
# "set results to plot" selection as the boxplot cells.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "precision",
    save=True,
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_scatter_precision.html
In [36]:
# Scatter plot of `recall` with respect to cluster size.
# Reads results_to_plot (not `results`) so this cell follows the same
# "set results to plot" selection as the boxplot cells.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "recall",
    save=True,
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_scatter_recall.html
In [37]:
# Scatter plot of `onTarget` with respect to cluster size.
# Reads results_to_plot (not `results`) so this cell follows the same
# "set results to plot" selection as the boxplot cells.
ns.pl.scatter_w_clusterSize(
    results_to_plot,
    "onTarget",
    save=True,
    output_folder=output_folder,
    outputfilename_prefix=outputfilename_prefix,
)
Saving...
 outputs_kidney_Lake_2023/subclass.full_scatter_onTarget.html
In [38]:
## Download gene mapping utilities from cell-kn (one-time)
import urllib.request
import os

# NOTE(review): this fetches Python source from the moving `main` branch and the
# import below executes it. Consider pinning the URL to a specific commit and
# reviewing the downloaded file before running.
if not os.path.exists('gene_mapping_utils.py'):
    print("Downloading gene_mapping_utils.py from cell-kn...")
    url = "https://raw.githubusercontent.com/NIH-NLM/cell-kn/main/utils/gene_mapping_utils.py"
    # Download under a temporary name and rename only on success, so an
    # interrupted download cannot leave a truncated module behind that would
    # pass the exists() check above on the next run and then fail to import.
    partial_path = "gene_mapping_utils.py.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, "gene_mapping_utils.py")
    finally:
        # Still present only if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Downloaded!")

# %%
## Import and use
from gene_mapping_utils import load_gene_mapping, create_mapping_dict

# Load mapping (cached after first run)
gene_mapping = load_gene_mapping()
ensg_to_symbol = create_mapping_dict(gene_mapping)
print(f"Loaded {len(gene_mapping)} gene mappings")

# Map ENSG IDs to gene names (FAST)
# IDs missing from the mapping fall back to the ENSG ID itself.
results_to_plot['gene_names'] = [
    [ensg_to_symbol.get(gene, gene) for gene in markers]
    for markers in results_to_plot['NSForest_markers']
]

print(f"\nMapped markers for {len(results_to_plot)} clusters")

# Create markers_dict: cluster name -> list of mapped marker names
markers_dict = dict(zip(results_to_plot["clusterName"], 
                       results_to_plot["gene_names"]))

print(f"markers_dict created with {len(markers_dict)} clusters")
Loading gene mapping from gene_mapping.csv
Loaded 34460 gene mappings

Mapped markers for 75 clusters
markers_dict created with 75 clusters
In [39]:
## Attach a display label for every gene to adata.var
# 'gene_symbol' holds the mapped symbol; names absent from ensg_to_symbol are kept unchanged
adata.var['gene_symbol'] = list(
    map(lambda var_name: ensg_to_symbol.get(var_name, var_name), adata.var_names)
)
In [40]:
# Dot plot of the markers in markers_dict, grouped by cluster_header; saved as svg
ns.pl.dotplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix,
)
WARNING: saving figure to file outputs_kidney_Lake_2023/dotplot_subclass.full.svg
No description has been provided for this image
In [41]:
# Same dot plot with standard_scale='var'; written with a "_scaled" filename suffix
ns.pl.dotplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    standard_scale='var',
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix + "_scaled",
)
WARNING: saving figure to file outputs_kidney_Lake_2023/dotplot_subclass.full_scaled.svg
No description has been provided for this image
In [42]:
# Stacked violin plot of the markers in markers_dict, grouped by cluster_header; saved as svg
ns.pl.stackedviolin(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix,
)
WARNING: saving figure to file outputs_kidney_Lake_2023/stacked_violin_subclass.full.svg
No description has been provided for this image
In [ ]:
# Same stacked violin plot with standard_scale='var'; written with a "_scaled" filename suffix
ns.pl.stackedviolin(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    standard_scale='var',
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix + "_scaled",
)
In [ ]:
# Matrix plot of the markers in markers_dict, grouped by cluster_header; saved as svg
ns.pl.matrixplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix,
)
In [ ]:
# Same matrix plot with standard_scale='var'; written with a "_scaled" filename suffix
ns.pl.matrixplot(
    adata,
    markers_dict,
    cluster_header,
    dendrogram=True,
    use_raw=False,
    gene_symbols='gene_symbol',  # label genes by symbol rather than ENSG ID
    standard_scale='var',
    save="svg",
    output_folder=output_folder,
    outputfilename_suffix=outputfilename_prefix + "_scaled",
)

Save¶

count saved items in the output folder¶

In [ ]:
from pathlib import Path

folder_path = Path(output_folder)
# iterdir() is not recursive: only the folder's direct children are tallied
item_count = sum(1 for _entry in folder_path.iterdir())
print(f"Total items in the output folder: {item_count}")

saving html report¶

In [ ]:
## switch plotly to notebook mode for the interactive plot
import plotly.offline
plotly.offline.init_notebook_mode()
In [ ]:
## save html report
# Shell command (IPython `!`): runs nbconvert on the notebook file named below
# to produce an HTML version of it.
# NOTE(review): the `#<---` tag presumably marks the filename as the value to
# edit so it matches this notebook's actual name -- confirm.
!jupyter nbconvert --to html DEMO_NS-Forest_workflow.ipynb #<---